{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "head: cannot open '../b-cari-com-my/data.jsonl' for reading: No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 3 /home/husein/ssd3/b-cari-com-my/data.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the JSON-Lines dump into memory: one parsed JSON document per input line.\n",
    "data = []\n",
    "with open('/home/husein/ssd3/b-cari-com-my/data.jsonl', encoding='utf-8') as fopen:\n",
    "    for l in fopen:\n",
    "        # A whitespace-only line (e.g. a stray blank line at the end of the file)\n",
    "        # is not valid JSON; skip it rather than abort the whole load.\n",
    "        if not l.strip():\n",
    "            continue\n",
    "        data.append(json.loads(l))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "670540"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def simple(string):\n",
    "    \"\"\"Normalise text for loose comparison.\n",
    "\n",
    "    Every run of characters other than ASCII letters or spaces is replaced by\n",
    "    a single space, runs of spaces are collapsed to one, both ends are trimmed\n",
    "    and the result is lowercased.\n",
    "    \"\"\"\n",
    "    letters_only = re.sub('[^A-Za-z ]+', ' ', string)\n",
    "    single_spaced = re.sub(r'[ ]+', ' ', letters_only)\n",
    "    return single_spaced.strip().lower()\n",
    "\n",
    "# Phrases matched case-insensitively (the text is lowercased before the check).\n",
    "# Some entries are substrings of longer ones and therefore redundant; they are\n",
    "# kept for explicitness.\n",
    "_REJECT_PHRASES = (\n",
    "    'saya tidak dapat menterjemah teks',\n",
    "    'saya tidak boleh menterjemah kandungan yang tidak sesuai',\n",
    "    'saya tidak boleh menterjemah teks',\n",
    "    'teks yang diberikan tidak mempunyai makna',\n",
    "    'teks yang disediakan tidak boleh diterjemahkan',\n",
    "    'teks yang diberikan tidak masuk akal',\n",
    "    'teks yang diberikan nampaknya tidak',\n",
    "    'teks yang diberikan tidak mempunyai maksud',\n",
    "    'teks yang diberikan bukan dalam mana-mana bahasa',\n",
    "    'teks ini tidak boleh diterjemahkan',\n",
    "    'saya tidak dapat menterjemah frasa',\n",
    "    'teks yang disediakan bukan',\n",
    "    'teks yang diberikan tidak jelas',\n",
    "    'teks yang diberikan tidak',\n",
    "    'terjemahan teks kepada',\n",
    "    'saya tidak boleh menterjemah bahasa',\n",
    "    'bahasa melayu standard',\n",
    "    'teks yang diberikan bukan dalam bahasa',\n",
    "    'teks yang anda berikan bukan',\n",
    "    'teks yang disediakan tidak',\n",
    "    'teks yang diberikan bukan',\n",
    "    'saya tidak dapat menterjemah',\n",
    "    'tetapi teks yang diberikan',\n",
    ")\n",
    "\n",
    "# Phrases matched against the text exactly as written (case-sensitive).\n",
    "_REJECT_PHRASES_CASED = (\n",
    "    'model bahasa AI',\n",
    "    'return JSON structure',\n",
    ")\n",
    "\n",
    "\n",
    "def reject(k):\n",
    "    \"\"\"Tell whether the text `k` passes the phrase filter.\n",
    "\n",
    "    Note the inverted naming: the result is True when `k` should be KEPT and\n",
    "    False when it should be dropped, which is how `if not reject(...)` uses it.\n",
    "\n",
    "    Args:\n",
    "        k: Candidate text. Anything that is not a `str` (for example `None`)\n",
    "            is treated as not passing.\n",
    "\n",
    "    Returns:\n",
    "        bool: False if `k` is not a string or contains any listed phrase,\n",
    "        True otherwise.\n",
    "    \"\"\"\n",
    "    if not isinstance(k, str):\n",
    "        return False\n",
    "\n",
    "    # Lowercase once instead of once per phrase.\n",
    "    lowered = k.lower()\n",
    "    if any(phrase in lowered for phrase in _REJECT_PHRASES):\n",
    "        return False\n",
    "    if any(phrase in k for phrase in _REJECT_PHRASES_CASED):\n",
    "        return False\n",
    "\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████| 670540/670540 [00:12<00:00, 51670.25it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Collect one {'left', 'en', 'ms'} dict per usable record of `data`.\n",
    "filtered = []\n",
    "for d in tqdm(data):\n",
    "    if d is None:\n",
    "        continue\n",
    "    if 'src' in d:\n",
    "        # Layout 1: source text under d['src']['text'], translations under d['r'].\n",
    "        left = d['src']['text']\n",
    "        try:\n",
    "            en = d['r']['english']\n",
    "            ms = d['r']['malay']\n",
    "        except Exception:\n",
    "            # Missing or malformed result: treat the record as untranslated.\n",
    "            en = None\n",
    "            ms = None\n",
    "    else:\n",
    "        # Layout 2: d[0] is the source text, d[1] the serialised result.\n",
    "        left = d[0]\n",
    "        success = False\n",
    "        try:\n",
    "            r = json.loads(d[1])\n",
    "            success = True\n",
    "        except Exception:\n",
    "            pass\n",
    "\n",
    "        if not success:\n",
    "            try:\n",
    "                # SECURITY NOTE(review): eval() executes whatever expression the\n",
    "                # data file contains. It is used as a fallback for payloads that\n",
    "                # are not valid JSON (presumably Python-literal dicts/tuples --\n",
    "                # confirm); consider ast.literal_eval if only literals are needed.\n",
    "                r = eval(d[1])\n",
    "                success = True\n",
    "            except Exception:\n",
    "                pass\n",
    "\n",
    "        en = None\n",
    "        ms = None\n",
    "\n",
    "        try:\n",
    "            if success:\n",
    "                if isinstance(r, tuple):\n",
    "                    # Several result dicts: join their parts with single spaces.\n",
    "                    en = [r_['english'] for r_ in r]\n",
    "                    ms = [r_['malay'] for r_ in r]\n",
    "                    en = ' '.join(en)\n",
    "                    ms = ' '.join(ms)\n",
    "                else:\n",
    "                    en = r['english']\n",
    "                    ms = r['malay']\n",
    "        except Exception:\n",
    "            # Best effort: whatever was assigned before the failure is kept and\n",
    "            # validated by the type check below.\n",
    "            pass\n",
    "\n",
    "    # A translation that is present but not a plain string (e.g. a half-built\n",
    "    # list or a nested object) cannot be normalised by simple(); skip the record.\n",
    "    if (en and not isinstance(en, str)) or (ms and not isinstance(ms, str)):\n",
    "        continue\n",
    "\n",
    "    simple_left = simple(left)\n",
    "\n",
    "    # Drop records whose translation is just the source text again.\n",
    "    if (en and simple(en) == simple_left) or (ms and simple(ms) == simple_left):\n",
    "        continue\n",
    "\n",
    "    if en or ms:\n",
    "        filtered.append({\n",
    "            'left': left,\n",
    "            'en': en,\n",
    "            'ms': ms\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "633420"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(633420, 670540)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(filtered), len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the kept pairs as JSON Lines. Only records whose Malay text is a string\n",
    "# that passes reject() are written.\n",
    "written = 0\n",
    "skipped = 0\n",
    "with open('processed-b.cari.com.my.jsonl', 'w') as fopen:\n",
    "    for d in filtered:\n",
    "        ms = d['ms']\n",
    "        # Records without a Malay translation, or flagged by reject(), are dropped.\n",
    "        if not isinstance(ms, str) or not reject(ms):\n",
    "            skipped += 1\n",
    "            continue\n",
    "        try:\n",
    "            line = json.dumps(d)\n",
    "        except (TypeError, ValueError):\n",
    "            # Best effort: a record that cannot be serialised is skipped, not fatal.\n",
    "            skipped += 1\n",
    "            continue\n",
    "        fopen.write(f'{line}\\n')\n",
    "        written += 1\n",
    "\n",
    "written, skipped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "631727 processed-b.cari.com.my.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l processed-b.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"left\": \"bagusla mawi bagi peluang kat junior2 dia yang berbakat tak kira lelaki atau perempuan untuk tonjolkan bakat, bila appear dgn mawi ada la orang tertarik nak ambik lagi\", \"en\": \"It's good that Mawi gives opportunities to talented juniors regardless of gender to showcase their talents. When they appear with Mawi, there are people who are interested in taking them on again.\", \"ms\": \"Baguslah Mawi memberi peluang kepada junior-junior yang berbakat tanpa mengira jantina untuk menonjolkan bakat mereka. Apabila mereka muncul bersama Mawi, terdapat orang yang berminat untuk mengambil mereka lagi.\"}\r\n",
      "{\"left\": \"jd guru sebenarnya susah..bebanan tugasan guru sebenarnya anda belum rasa..semasa pratikal..tugas guru bukan stengah hari tp rasanya lebih 24 jm.. betul kata org tua anda guru memang susah naik pangkat..\", \"en\": \"Being a real teacher is actually difficult.. You haven't felt the burden of a real teacher's duties.. During practical training.. A teacher's job is not just half a day but feels like more than 24 hours.. Your elders were right, it's difficult for teachers to get promoted..\", \"ms\": \"Menjadi seorang guru sebenarnya sukar.. Anda belum merasai beban tugasan seorang guru sebenar.. Semasa latihan praktikal.. Tugas seorang guru bukanlah setengah hari tetapi terasa lebih dari 24 jam.. Benar kata orang tua anda, susah untuk guru naik pangkat..\"}\r\n",
      "{\"left\": \"macam nie pon ada kaa..\", \"en\": \"Is it like this also?\", \"ms\": \"Macam ni pun ada ke?\"}\r\n",
      "{\"left\": \"Kristen Stewart tu cantik tapi skill berlakon tu cacat. takda emosi la\", \"en\": \"Kristen Stewart is beautiful but her acting skills are flawed. She lacks emotions.\", \"ms\": \"Kristen Stewart cantik tetapi kemahiran lakonannya kurang baik. Dia kurang menunjukkan emosi.\"}\r\n",
      "{\"left\": \"dah beli D5000 tapi aper pun cuba ler buka web nie  http://www.dpreview.com/reviews/nikond5000\", \"en\": \"I have bought a D5000 but I am trying to open this website http://www.dpreview.com/reviews/nikond5000 but it's not working.\", \"ms\": \"Saya telah membeli D5000 tetapi saya cuba membuka laman web ini http://www.dpreview.com/reviews/nikond5000 tetapi ia tidak berfungsi.\"}\r\n",
      "{\"left\": \"cubala anda bayangkan anda dlam seituasi begini , dahulu anda ceria dan periang tetapi apabila maslaa ini dtg.. anda tidak mahu bergauh dan murung ..kerana semua akan palingkan muka apabila bercakap dgn anda. Sy terlalu ingin berjumpa dgn pakai mengenai maslaah ini tetapi tidak tahu bagaimana.\", \"en\": \"Imagine yourself in this situation, where you used to be cheerful and happy-go-lucky, but when this problem arose, you became withdrawn and gloomy. You don't want to socialize because you feel like everyone will turn away from you when you speak. You really want to meet with someone to discuss this issue, but you don't know how.\", \"ms\": \"Bayangkan diri anda berada dalam situasi ini, di mana anda dahulu ceria dan riang tetapi apabila masalah ini timbul, anda menjadi pendiam dan murung. Anda tidak mahu bergaul kerana anda rasa semua orang akan memalingkan muka apabila anda bercakap. Anda sangat ingin berjumpa dengan seseorang untuk membincangkan masalah ini, tetapi anda tidak tahu bagaimana.\"}\r\n",
      "{\"left\": \"salam  nak tanya ada tak sesiapa ada hantar kat tadika ni..nak tahu ok tak.  jika ada cadangan mana2 tadika ygg amalkan islamic educators..\", \"en\": \"Hi, may I ask if anyone has sent their child to this kindergarten before? I want to know if it's good. Also, if anyone has any recommendations for kindergartens that practice Islamic education, please let me know.\", \"ms\": \"Salam, boleh saya tanya ada sesiapa yang pernah hantar anak ke tadika ini sebelum ini? Saya ingin tahu sama ada ia baik atau tidak. Jika ada cadangan untuk tadika yang mengamalkan pendidikan Islam, tolong beritahu saya.\"}\r\n",
      "{\"left\": \"macam mana nk tahu serasi ke x???  cz mula2 rasa serasi but after 3-4 month start la ade masalah..\", \"en\": \"How to know if it's compatible or not??? Because at first it feels compatible but after 3-4 months problems start to arise..\", \"ms\": \"Bagaimana untuk mengetahui sama ada ia serasi atau tidak??? Kerana pada awalnya ia kelihatan serasi tetapi selepas 3-4 bulan masalah mula timbul..\"}\r\n",
      "{\"left\": \"8# khairimhd  tak pernah pulak dengar ubi nyaru.. mungkin panggilan lain kut.. biasa kalo pemes utk orang pompuan, selain dari kacip fatimah ialah bunga pokma (raflesia).. pastu ada gak kulat (tak ingat nama apa)..  dulu kalo ikut ayah masuk hutan.. dia tunjuk ler.. pokok nih utk nih.. pokok tu utk mende nih.. dulu, sebelum ayah jadi \\\"mafia\\\" FRU (sbb selalu orang pandang serong kat FRU).. ayah kerja PPH (Pasukan Polis Hutan).. kerjanya masuk hutan sampai berbulan bulan nak cari komunis.. masa tuh ler dia belajar khasiat herba2 hutan dari orang asli.. tapi anak2 cam amik tak amik tahu jer khasiat dari warisan purba..\", \"en\": \"I've never heard of ubi nyaru before. Maybe it has another name. Usually, for women, besides kacip fatimah, there's also bunga pokma (raflesia). And then there's also kulat (can't remember the name). When I used to go into the forest with my father, he would show me which tree was for what. This tree is for this, that tree is for that. Before my father became a 'mafia' in the FRU (because people always looked at the FRU suspiciously), he worked for the PPH (Forest Police Force). His job was to go into the forest for months to find communists. During that time, he learned about the medicinal properties of forest herbs from the indigenous people. But us kids just took it for granted and didn't really learn about the ancient heritage of these plants.\", \"ms\": \"Saya tidak pernah mendengar tentang ubi nyaru sebelum ini. Mungkin ia mempunyai nama lain. Biasanya, untuk wanita, selain daripada kacip fatimah, terdapat juga bunga pokma (raflesia). Dan kemudian ada juga kulat (tidak ingat namanya). Ketika saya dulu pergi ke dalam hutan dengan ayah saya, beliau akan menunjukkan pokok mana yang untuk apa. Pokok ini untuk ini, pokok itu untuk itu. Sebelum ayah saya menjadi 'mafia' dalam FRU (kerana orang selalu memandang FRU dengan curiga), beliau bekerja untuk PPH (Pasukan Polis Hutan). 
Tugasnya adalah untuk masuk ke dalam hutan selama berbulan-bulan untuk mencari komunis. Pada masa itu, beliau belajar tentang sifat perubatan herba hutan dari orang asli. Tetapi kami kanak-kanak hanya mengambilnya sebagai sesuatu yang biasa dan tidak benar-benar mempelajari warisan purba tumbuhan-tumbuhan ini.\"}\r\n",
      "{\"left\": \"saya jual spagetthi, pasta salad, mushroom sup, sandwich n sweet bun kat kiosk saya. baru mula berni ..\\nrinadini Post at 10-1-2012 11:02  huishh..  niaga pasta bagai..  space msti arr besar..  kiosk ak nie rase tecik jer..  sye bkn pminat pasta..plaster ada laa..huhuuh..  thanks in advanced..\", \"en\": \"I sell spaghetti, pasta salad, mushroom soup, sandwiches, and sweet buns at my kiosk. Just started my business..\", \"ms\": \"Saya menjual spaghetti, salad pasta, sup cendawan, sandwic dan roti manis di kiosk saya. Baru memulakan perniagaan..\"}\r\n"
     ]
    }
   ],
   "source": [
    "!head -n 10 processed-b.cari.com.my.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
